library(readr)
library(tidyverse)
library(ggplot2)
library(janitor)
library(tidycensus)
library(viridis) 
library(tscount)
library(yrbss)
library(zoo)
library(purrr)
import_raw_data = FALSE

if(import_raw_data){

ss = read_csv("../../data_1/school_shootings.csv")
colnames(ss) = ss[1,]
ss = ss[-1,]%>%
  clean_names()%>%
  select(-c(na,na_2))%>%
  filter(reliability_score_1_5!=1)

ss2 = read_csv("../../data_1/ss_pt2.csv")%>%
  select(-X1)
colnames(ss2)=colnames(ss)[34:ncol(ss)]

ss[(1426:nrow(ss)),(34:ncol(ss))]=ss2

ss$shooter_age = as.numeric(ss$shooter_age)

write.csv(ss,"ss_data.csv")
}

Read in and clean data

# ss is csv file with school shooting data (individual incidents)
ss = read_csv("ss_data.csv")%>%
  select(-X1)%>%
  mutate(year = as.numeric(substr(date, (nchar(date)-3),nchar(date))))%>%
  mutate(state = ifelse(state == "St. Croix, US Virgin Islands","St.Croix",state))%>%
  distinct_at(.vars = c(1:24), .keep_all = T)

Population data

# data set to go from state names to abbreviations

states = data.frame(state.name,state.abb)
dc = data.frame(matrix(ncol= 2, nrow = 1))
colnames(dc)=colnames(states)
dc$`state.name` = "District of Columbia"
dc$`state.abb` = "DC"

states = rbind(states,dc)
states$state.name = as.character(states$state.name)

if(import_raw_data){

# 1980-1990
pop8090_raw = read.delim("https://www2.census.gov/programs-surveys/popest/tables/1980-1990/state/asrh/st8090ts.txt" )%>%data.frame()
pop8090_raw = pop8090_raw[c(8:58,64:114),]%>%data.frame()
colnames(pop8090_raw)="var"
pop8090 = data.frame(pop8090_raw$var[1:51],pop8090_raw$var[52:102])

colnames(pop8090)=c("var1","var2")


pop8090 = pop8090%>%
  separate(var1, c("state",as.character(seq(1980,1984,by = 1))), extra = "merge")%>%
  separate(var2, c("state1",as.character(seq(1985,1990,by = 1))), extra = "merge")%>%
  select(c(state,starts_with("19")))
  
# 1970 - 1980
pop7080_raw = read.delim("https://www2.census.gov/programs-surveys/popest/tables/1980-1990/state/asrh/st7080ts.txt")
pop7080_raw = pop7080_raw[c(10:61,63:114),]%>%data.frame()
colnames(pop7080_raw)="var"
pop7080 = data.frame(pop7080_raw$var[1:51],pop7080_raw$var[53:103])

colnames(pop7080)=c("var1","var2")

pop7080 = pop7080%>%
  separate(var1, c("blnk","id","state","1970","1971","1972","1973","1974","1975"), extra = "merge")%>%
  separate(var2, c("blnk1","id1","state1",as.character(seq(1976,1980,by = 1))), extra = "merge")%>%
  select(c(state,starts_with("19")))

# 1990-2000
pop90_00 = read_csv("../../data/pop_90-00.csv")%>%
  rename(state = X1)%>%
  select(c(state,3:13))%>%
  filter(state != "USA")
colnames(pop90_00)[2:ncol(pop90_00)]=as.character(seq(1990,2000,by = 1))
pop90_00$state[30:35] = states$state.name[29:34]
pop90_00$state[40:42] = states$state.name[39:41]
pop90_00$state[9] = states$state.name[51]
pop90_00$state[49] = states$state.name[48]

# 2000-2010
pop00_10 = read.csv("https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/state/st-est00int-agesex.csv")%>%
  clean_names()%>%
  filter(sex == 0, age == 999)%>%
  select(c(name,starts_with("popestimate")))%>%
  rename_at(vars(starts_with("popestimate")),funs(substr(.,start = 12, stop = 15)))%>%
  rename(state = name)%>%
  mutate(state = as.character(state))%>%
  mutate(state = ifelse(state=="District of Columbia",as.character(states$state.name[51]),state))%>%
  filter(state != "United States")

# 2010-2018
pop10_18 = read_csv("../../data/pop_2010-2018.csv")
colnames(pop10_18)=pop10_18[1,]

pop10_18 = pop10_18%>%
  slice(-1)%>%
  rename_at(vars(starts_with("Pop")), funs(substr(.,start = 38,stop = 41)))%>%
  rename(state = Geography)%>%
  select(c(state,starts_with("20")))%>%
  mutate(state = ifelse(state=="District of Columbia",as.character(states$state.name[51]),state))

# 2019

pop2019 = read_csv("../../data/2019pop.csv")%>%
  clean_names()%>%
  select(c(state,pop))%>%
  rename("2019"=pop)



pops = left_join(pop7080,pop8090%>%select(-c("1980")), by = "state")%>%
  left_join(states,by = c("state"="state.abb"))%>%
  mutate(state = state.name)%>%
  select(-state.name)%>%
  left_join(pop90_00%>%select(-c("1990")), by = "state")%>%
  left_join(pop00_10%>%select(-c("2000")), by = "state")%>%
  left_join(pop10_18%>%select(-c("2010")), by = "state")%>%
  left_join(pop2019,by = "state")%>%
  gather(key = "year", value = "population", c(2:51))


write.csv(pops, "state_populations.csv")

}

Read in population data

# read in population data (include abbreviated and full state name)
pops = read_csv("state_populations.csv")%>%
  select(-X1)%>%
  left_join(states, by = c("state"="state.name"))

Incident count format

sts = unique(ss$state)
yrs = seq(range(ss$year)[1],range(ss$year)[2],by=1)

st_yr = data.frame(state = rep(sts,length(yrs)), year = sort(rep(yrs,length(sts))))%>%
  arrange(state)

## data set aggrgated by state and year
ss_counts = ss%>%
  group_by(state,year) %>%
  summarize(incident_count = n(), 
            total_victims = sum(total_injured_killed_victims), 
            total_fatalities = sum(killed_includes_shooter),
            total_wounded = sum(wounded),
            avg_victims = mean(total_injured_killed_victims),
            avg_fatalities = mean(killed_includes_shooter),
            ave_wounded = mean(wounded),
            avg_shooter_age = mean(shooter_age),
            max_shooter_age = max(shooter_age),
            min_shooter_age = min(shooter_age),
            avg_reliability = mean(reliability_score_1_5),
            target = mean(ifelse(targeted_specific_victim_s=="Y",1,
                                 ifelse(targeted_specific_victim_s=="N",0,NA)),na.rm = T),
            random_vict = mean(ifelse(random_victims=="Y",1,
                                      ifelse(random_victims=="N",0,NA)),na.rm = T),
            bullied = mean(ifelse(bullied_y_n_n_a=="Y",1,
                                  ifelse(bullied_y_n_n_a=="N",0,NA)),na.rm = T),
            domestic_violence = mean(ifelse(domestic_violence_y_n=="Y",1,
                                            ifelse(domestic_violence_y_n=="N",0,NA)),na.rm = T))%>%
  mutate(in_ss = TRUE)%>%
  full_join(st_yr, by = c("state","year"))%>%
  left_join(pops%>%rename(full_state_name = state), by = c("state"="state.abb", "year"))%>%
  mutate(incident_count = ifelse(is.na(incident_count),0,incident_count))%>%
  mutate(in_ss = ifelse(is.na(in_ss),FALSE,in_ss))

Grade data

# read in state grade data

if(import_raw_data){
grades18 = read_csv("../../data/2018grades.csv")%>%
  select(-c(X6,X7))%>%
  clean_names()%>%
  select(-gun_death_rate_per_100k)%>%
  mutate(year = "2018")%>%
  rename(grade = x2018_grade)

grades17 = read_csv("../../data/2017grades.csv")%>%
  clean_names()%>%
  mutate(year = "2017")%>%
  rename(grade = x2017_grade)

grades16 = read_csv("../../data/2016grades.csv")%>%
  clean_names()%>%
  mutate(year = "2016")%>%
  rename(grade = x2016_grade)


grades15 = read_csv("../../data/2015grades.csv")%>%
  clean_names()%>%
  mutate(year = "2015")%>%
  rename(grade = x2015_grade)

grades14 = read_csv("../../data/2014grades.csv")%>%
  clean_names()%>%
  mutate(year = "2014")%>%
  rename(grade = x2014_grade)

grades13 = read_csv("../../data/2013grades.csv")%>%
  clean_names()%>%
  mutate(year = "2013")%>%
  rename(grade = x2013_grade)

grades12 = read_csv("../../data/2012grades.csv")%>%
  clean_names()%>%
  mutate(year = "2012")%>%
  rename(grade = x2012_grade)


# combine all year data 

grades = bind_rows(grades18,grades17)%>%
  bind_rows(grades16)%>%
  bind_rows(grades15)%>%
  bind_rows(grades14)%>%
  bind_rows(grades13)%>%
  bind_rows(grades12)

write.csv(grades,"state_grades.csv")
}

Read in grade data

grades = read_csv("state_grades.csv")%>%
  select(-X1)

gpa_convert = data.frame(letter_grade = c("A","A-","B+","B","B-",
                                          "C+","C","C-","D+","D","D-","F"),
                         gpa = c(4,3.7,3.3,3,2.7,2.3,2,1.7,1.3,1,0.7,0))

ss_counts = ss_counts%>%
  left_join(grades%>%select(state,grade,year),by = c("full_state_name"="state","year"))%>%
  left_join(gpa_convert, by = c("grade"="letter_grade"))

Media data

media = read_csv("media_data.csv")%>%
  separate(`Year-Month`, into = c("year","month"), sep = "-")%>%
  clean_names()%>%
  select(-starts_with("x"))%>%
  group_by(year) %>%
  mutate(yearly_articles = sum(articles_shootings_excluding_firearm_laws_and_regulations))%>% 
  mutate(yearly_shootings = sum(mass_shooting))%>%
  mutate(art_per_inc = yearly_articles/yearly_shootings)%>%
  ungroup()%>%
  mutate(prev_month_art = c(0,articles_shootings_excluding_firearm_laws_and_regulations[1:227]))%>%
  mutate(prev_year_art = c(rep(0,12),yearly_articles[1:216]))%>%
  mutate(year = as.numeric(year))


ss_counts = ss_counts%>%
  left_join(media%>%select(prev_year_art, year,yearly_articles)%>%distinct(), by = "year")

Poverty data

pov=read.csv("census_poverty_data.csv",stringsAsFactors = FALSE)
pov=filter(pov,State!="      United States"&State!="     United States"& State!="District of Columbia"&State!="United States")

names(pov)[1:2]<-c("state","poverty")
names(pov)[4]<-"year"

skel=expand_grid(sort(unique(pov$state)),min(pov$year):(max(pov$year)+2))
names(skel)<-c("state","year")



sk1=filter(skel,year==2007|year==2008|year==2009)
sk1=left_join(sk1,filter(pov,year==2007),by="state")

sk2=filter(skel,year==2010|year==2011|year==2012)
sk2=left_join(sk2,filter(pov,year==2010),by="state")

sk3=filter(skel,year==2013|year==2014|year==2015)
sk3=left_join(sk3,filter(pov,year==2013),by="state")

sk4=filter(skel,year==2016|year==2017|year==2018)
sk4=left_join(sk4,filter(pov,year==2016),by="state")

pov=rbind(sk1,sk2,sk3,sk4)
pov=pov[,1:3]
names(pov)<-c("state","year","poverty")

Merge poverty data

pov = left_join(pov,states, by = c("state"="state.name"))%>%
  select(-state)%>%
  rename(state = state.abb)
ss_counts = left_join(ss_counts, pov, by = c("state","year"))

Bullying data

bully = read_csv("State_bullying.csv")%>%
  select(c(total_score,state.abb))%>%
  rename(state = state.abb)%>%
  mutate(year = 2010)

ss_counts = left_join(ss_counts, bully%>%select(-year), by = "state")%>%
  rename(bullying_score = total_score)%>%
  mutate(bullying_score = as.numeric(bullying_score))

Mental Health data

mh.data=read.csv("mh.data.csv",stringsAsFactors = FALSE)%>%
  clean_names()%>%
  select(-x)%>%
  left_join(states, by = c("state"="state.name"))%>%
  select(-state)%>%
  rename(state = state.abb)

ss_counts = ss_counts%>%
  left_join(mh.data, by = c("state","year"))%>%
  rename(mh_score = percent)
## Warning: Column `state` joining character vector and factor, coercing into
## character vector
# Rand Laws Data

data.rand=read.csv("rand.laws.database.csv",stringsAsFactors = FALSE)

CDC Bullying Data

qns = c("qn13","qn14","qn15","qn16","qn17","qn18","qn19","qn20","qn24","qn25","qn26","qn27","qn28","qn29","qn30","qnbullyweight","qnbullygay")
sts = yrbss::getListOfParticipatingStates()
yrs = c(1991,1993,1995,1997,1999,2001,2003,2005,2007,2009,2011,2013,2015)

if(import_raw_data){
bullying = data.frame(variable = NULL, state = NULL, year = NULL, prop= NULL, ciLB= NULL, ciUB = NULL, n = NULL)

for(v in qns){
  for(s in sts){
    for(y in yrs ){
      p = yrbss::getProportionSingleVariable(.variable = v, .location = s, .year = y, .level = 0.95)$prop
    ciL = yrbss::getProportionSingleVariable(.variable = v, .location = s, .year = y, .level = 0.95)$ciLB
    ciU = yrbss::getProportionSingleVariable(.variable = v, .location = s, .year = y, .level = 0.95)$ciUB
    sample_size = yrbss::getProportionSingleVariable(.variable = v, .location = s, .year = y, .level = 0.95)$n
    newrow = data.frame(variable = v, state = s, year = y, prop= p, ciLB= ciL, ciUB = ciU, n = sample_size)
    bullying = bind_rows(bullying,newrow)}
}}

lab = data.frame(question = yrbss::yrbss_questions_binary$variable, label = yrbss::yrbss_questions_binary$label)%>%
   filter(question %in% qns)
 
bullying = left_join(bullying,lab, by = c(variable = "question"))
bullying$state[which(bullying$state=="AZB")]="AZ"
write_csv(bullying, "bullying_survey.csv")
}

lab = data.frame(question = yrbss::yrbss_questions_binary$variable, label = yrbss::yrbss_questions_binary$label)%>%
   filter(question %in% qns)
# read in csv
bullying_survey = read_csv("bullying_survey.csv")
## Parsed with column specification:
## cols(
##   variable = col_character(),
##   state = col_character(),
##   year = col_double(),
##   prop = col_double(),
##   ciLB = col_double(),
##   ciUB = col_double(),
##   n = col_double(),
##   label = col_character()
## )
bullying_to_merge = read_csv("bullying_survey.csv")%>%
  select(-c(ciLB,ciUB,n,label))%>%
  gather(key = stat_type, value = value, prop)%>%
  spread(variable, value)%>%
  select(-stat_type)
## Parsed with column specification:
## cols(
##   variable = col_character(),
##   state = col_character(),
##   year = col_double(),
##   prop = col_double(),
##   ciLB = col_double(),
##   ciUB = col_double(),
##   n = col_double(),
##   label = col_character()
## )
colnames(bullying_to_merge)[3:17]=as.character(lab$label[1:15])

bullying_to_merge = bullying_to_merge%>%
  clean_names()

ss_counts = left_join(ss_counts, bullying_to_merge, by = c("state","year"))


# check missingness for 2009-2015
sum(!complete.cases(filter(bullying_survey, year>=2009)))/nrow(filter(bullying_survey, year>=2009))
## [1] 0.2949126
### We're going to need to impute values for missing years and states for some of these variables if these data are going to be useful in our analysis

Make combined variables for survey questions

# combine suicide questions 
suicide = data.frame(year = NULL, state = NULL, score = NULL, n = NULL)
suicide_qns = bullying_survey%>%
  filter(variable %in% c("qn26","qn27","qn28","qn29","qn30"))

for(y in yrs){
  for(st in sts){
    dat = filter(suicide_qns,(year ==y & state == st))
    suicide_score = weighted.mean(dat$prop,1/(dat$n), na.rm = TRUE)
    samp = mean(dat$n,na.rm = TRUE)
    newrow = data.frame(year = y, state = st, score = suicide_score, n = samp)
    suicide = bind_rows(suicide,newrow)
  }
}

# combine questions involving physical fights
fight = data.frame(year = NULL, state = NULL, score = NULL, n = NULL)
fight_qns = bullying_survey%>%
  filter(variable %in% c("qn18","qn19","qn20"))

for(y in yrs){
  for(st in sts){
    dat = filter(fight_qns,(year ==y & state == st))
    fight_score = weighted.mean(dat$prop,1/(dat$n), na.rm = TRUE)
    samp = mean(dat$n,na.rm = TRUE)
    newrow = data.frame(year = y, state = st, score = fight_score, n = samp)
    fight = bind_rows(fight,newrow)
  }
}

# school safety questions
safety = data.frame(year = NULL, state = NULL, score = NULL, n = NULL)
safety_qns = bullying_survey%>%
  filter(variable %in% c("qn13","qn14","qn15","qn16","qn17"))

for(y in yrs){
  for(st in sts){
    dat = filter(safety_qns,(year ==y & state == st))
    safety_score = weighted.mean(dat$prop,1/(dat$n), na.rm = TRUE)
    samp = mean(dat$n,na.rm = TRUE)
    newrow = data.frame(year = y, state = st, score = safety_score, n = samp)
    safety = bind_rows(safety,newrow)
  }
}

# bullying questions
bull = data.frame(year = NULL, state = NULL, score = NULL, n = NULL)
bully_qns = bullying_survey%>%
  filter(variable %in% c("qn24","qn25","qnbullyweight","qnbullygay"))

for(y in yrs){
  for(st in sts){
    dat = filter(bully_qns,(year ==y & state == st))
    bully_score = weighted.mean(dat$prop,1/(dat$n), na.rm = TRUE)
    samp = mean(dat$n,na.rm = TRUE)
    newrow = data.frame(year = y, state = st, score = bully_score, n = samp)
    bull = bind_rows(bull,newrow)
  }
}


survey_combined = bind_cols(suicide,safety, fight, bull)%>%
  select(year, state, score, n, score1, n1, score2, n2, score3, n3)%>%
  rename(suicide_score = score, school_safety_score = score1, fight_score = score2, bully_score = score3)%>%
  filter(year>=2009)%>%
  full_join(st_yr%>%filter(year>=2009))%>%
  arrange(year)%>%
  arrange(state)%>%
  mutate(state = ifelse(state == "AZB","AZ",state))

survey_combined = survey_combined%>%
  group_by(state)%>%
  mutate(suicide_score =ifelse(year<2016,ifelse(is.na(suicide_score), na.locf(suicide_score)/2+na.locf(suicide_score, fromLast = TRUE)/2,suicide_score),
                              NA ))%>%
  mutate(school_safety_score =ifelse(year<2016,ifelse(is.na(school_safety_score), na.locf(school_safety_score)/2+na.locf(school_safety_score, fromLast = TRUE)/2,school_safety_score),NA))%>%
    mutate(fight_score =ifelse(year<2016,ifelse(is.na(fight_score), na.locf(fight_score)/2+na.locf(fight_score, fromLast = TRUE)/2,fight_score),NA))%>%
      mutate(bully_score =ifelse(year<2016,ifelse(is.na(bully_score), na.locf(bully_score)/2+na.locf(bully_score, fromLast = TRUE)/2,bully_score),NA))%>%
        mutate(n=ifelse(year<2016,ifelse(is.na(n), na.locf(n)/2+na.locf(n, fromLast = TRUE)/2,n),NA))%>%
          mutate(n1=ifelse(year<2016,ifelse(is.na(n1), na.locf(n1)/2+na.locf(n1, fromLast = TRUE)/2,n1),NA))%>%
          mutate(n2=ifelse(year<2016,ifelse(is.na(n2), na.locf(n2)/2+na.locf(n2, fromLast = TRUE)/2,n2),NA))%>%       
  mutate(n3=ifelse(year<2016,ifelse(is.na(n3), na.locf(n3)/2+na.locf(n3, fromLast = TRUE)/2,n3),NA))


# use fitted line to get 2016-2019 values 
for(s in sts[which(!(sts %in% c("MA","AZB")))]){
  dat = filter(survey_combined, state ==s)
  ind = which(is.na(dat$suicide_score))[1]
  slp = dat[ind-1,3:10]-dat[ind-2,3:10]
  dat[which(is.na(dat$suicide_score))[1],3:10] =  dat[(ind-1),3:10]+as.numeric(dat[which(is.na(dat$suicide_score))[1],1]-2015)*slp
    dat[which(is.na(dat$suicide_score))[1],3:10] =  dat[(ind-1),3:10]+as.numeric(dat[which(is.na(dat$suicide_score))[1],1]-2015)*slp  
    dat[which(is.na(dat$suicide_score))[1],3:10] =  dat[(ind-1),3:10]+as.numeric(dat[which(is.na(dat$suicide_score))[1],1]-2015)*slp  
    dat[which(is.na(dat$suicide_score))[1],3:10] =  dat[(ind-1),3:10]+as.numeric(dat[which(is.na(dat$suicide_score))[1],1]-2015)*slp

  survey_combined[which(survey_combined$state == s),]=dat
}

EDA

table(ss$reliability_score_1_5)%>%
  data.frame()%>%
  ggplot(aes(x = Var1, y = Freq))+
  geom_bar(aes(fill = Var1), stat = "identity")+
  scale_fill_discrete(name = "Reliability Score")+
  labs(x = "Reliability Score", y = "Count")

table(ss$school_type)%>%
  data.frame()%>%
  ggplot(aes(x = Var1, y = Freq))+
  geom_bar(aes(fill = Var1), stat = "identity")+
  scale_fill_discrete(name = "School Type")+
  labs(x = "School Type", y = "Count")

ss%>%
  filter(shooter_age<200)%>%
  ggplot(aes(x = reorder(state, shooter_age, FUN = mean), y = shooter_age))+
  geom_boxplot()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))+
  labs(y = "Age of shooter", x = "State", title = "Age of Shooter by State")

ss%>%
  ggplot(aes(x = reorder(state, killed_includes_shooter, FUN = mean), y = killed_includes_shooter))+
  geom_boxplot()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))+
  labs(y = "Numbeer Killed (including shooter)", x = "State", title = "Fatalities")

ss%>%
  ggplot(aes(x = reorder(state, wounded, FUN = mean), y = total_injured_killed_victims))+
  geom_boxplot()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))+
  labs(y = "Wounded ", x = "State", title = "Number Wounded")

ss%>%
  ggplot(aes(x = reorder(state, total_injured_killed_victims, FUN = mean), y = total_injured_killed_victims))+
  geom_boxplot()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))+
  labs(y = "Wounded or Killed", x = "State", title = "Combined Wounded & Fatalities")

table(ss$state)%>%
  data.frame()%>%
  ggplot(aes(x = Var1, y = Freq))+
  geom_bar(aes(x = reorder(Var1,Freq, desc)), stat = "identity")+
  labs(x = "State", y = "Count", "Number of incidents by State (overall)")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))

Grade plots

cc = scales::seq_gradient_pal("green", "red", "Lab")(seq(0,1,length.out=12))



# This will handle the ordering
d <- ss_counts %>% 
  filter(year %in% seq(2012,2018,by=1))%>%
  ungroup() %>%   # As a precaution / handle in a separate .grouped_df method
  arrange(year, incident_count) %>%   # arrange by facet variables and continuous values
  mutate(.r = row_number()) # Add a row number variable

ggplot(d, aes(x = .r, y= incident_count, fill = grade)) +  # Use .r instead of x
  geom_point(data = d%>%filter(incident_count==0),
             aes(x = .r, y= incident_count, color = grade),
             size = 0.7) +
  geom_bar(stat = "identity")+
  facet_wrap(~ year, scales = "free_x") +  # Should free scales (though could be left to user)
  scale_x_continuous(  # This handles replacement of .r for x
    breaks = d$.r,     # notice need to reuse data frame
    labels = d$state
  )+
  scale_fill_manual(values = cc, name = "Grade")+
  scale_color_manual(values = cc, guide = FALSE)+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 3.5), legend.position = c(0.7,.15), legend.direction = "horizontal",legend.key.size = unit(1.5,"line"), legend.text = element_text(size = 8), legend.title = element_text(size = 10))

ss_counts%>%
  filter(year %in% seq(2012,2018,by=1))%>%
  filter(!is.na(grade))%>%
  ggplot(aes(x = incident_count , fill = grade))+
  geom_histogram(binwidth = 1)+
  facet_wrap(.~year)+
  scale_fill_manual(values = cc)+
  scale_x_continuous(breaks = seq(0,35,by=5))+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 3.5))+
  theme_bw()+
  labs(x= "Number of School Shootings")

ss_counts%>%
  filter(year %in% seq(2012,2018,by=1))%>%
  filter(!is.na(grade))%>%
  ggplot(aes(x = grade, y = incident_count, color = grade))+
  geom_boxplot()+
  facet_wrap(.~year)+
  theme_bw()+
  scale_color_manual(values = cc)+
  labs(y= "Number of School Shootings", x = "Grade")

ss_counts%>%
  filter(year %in% seq(2012,2018,by=1))%>%
  ggplot()+
  geom_bar(aes(x = reorder(state,population), y = incident_count), stat = "identity")+
  geom_point(aes(x = state, y = log(population), color = "population"), size = .4)+
  theme_bw()+
  labs(x = "State", y = "Number of School Shootings", title = "Number of School Shooting and Population by State")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 3.5), legend.position = c(.7, 0), legend.justification = c(1, 0))+
  facet_wrap(.~year)+
  scale_color_discrete(name = NULL, labels = "Population (log)")

ss_counts%>%
  filter(year %in% seq(2012,2018,by=1))%>%
  group_by(state) %>%
  mutate(ordr = mean(avg_victims, na.rm = T))%>%
  filter(ordr!="NaN")%>%
  ungroup()%>%
  arrange(desc(ordr))%>%
  ggplot(aes(x = reorder(state,ordr), y = avg_victims, color = year))+
  geom_point(size = 0.6, position = "jitter")+
  labs(x = "State", y = "Average number of victims per incident", title ="Victims per School Shooting")+
  scale_color_viridis(option = "D")+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))

Media Plots

media%>%
  ggplot(aes(x = prev_month_art, y = mass_shooting, color = as.numeric(year)))+geom_point(position = "jitter")+
  scale_color_continuous(name = "Year")+
  labs(y = "Mass Shooting Count", x = "Articles in Previous Month")+
  theme_bw()

media%>%
  ggplot(aes(y = articles_shootings_excluding_firearm_laws_and_regulations, x= as.numeric(year), color = as.numeric(year)))+
  geom_point()+
  geom_jitter()+
  labs(y = "Monthly Articles",x =  "Year")+
  theme(legend.position = "none")+
  theme_bw()

media%>%
  ggplot(aes(y = articles_shootings_excluding_firearm_laws_and_regulations/yearly_shootings, x= as.numeric(year), color = as.numeric(year)))+
  geom_point()+
  geom_jitter()+
  labs(y = "Monthly Articles/Yearly Mass Shootings",x =  "Year")+
  theme(legend.position = "none")+
  theme_bw()

media%>%
  ggplot(aes(y = yearly_articles, x= as.numeric(year)))+ 
  geom_point()+
  labs(y = "Yearly Articles",x =  "Year")+
  theme_bw()

school_counts = ss%>%
  select(school,city,state)%>%
  unite(col = school_city,c(school,city,state), sep = "_")%>%
  table(dnn = c("school","count"))%>%
  data.frame()%>%
  separate(col = school_city, into = c("school","city","state"), sep = "_" )


delta = data.frame(state = NULL, year = NULL, delta_gpa = NULL, delta_ss = NULL, delta_pop = NULL)
for(x in unique(ss_counts$state)){
  dat = filter(ss_counts, state == x)%>%
    arrange(year)
  chng_gpa = c(NA,diff(dat$gpa))
  chng_ss = c(NA, diff(dat$incident_count))
  chng_pop = c(NA, diff(dat$population))
  st = rep(x, nrow(dat))
  newrows =data.frame(state = st, year = dat$year, delta_gpa = chng_gpa, delta_ss = chng_ss, delta_pop= chng_pop)
  delta = bind_rows(delta,newrows)
  
}
ss_counts%>%
  group_by(year)%>%
  summarise(yearly_incidents = sum(incident_count), prev_year_art = prev_year_art[1] )%>%
  ggplot(aes(x = year, y = yearly_incidents, fill = prev_year_art))+geom_bar(stat = "identity")+
  labs(y = "Yearly School Shootings", x = "Year")+
  scale_fill_viridis(option = "D", name = "Articles in previous year")+
  theme_bw()

Bullying survey plots

ggplot(data= bullying_survey%>%filter(year>=2013), aes(x = year, y = prop, grouby_by = state, color = variable))+
  geom_line(position = "jitter")+
  scale_color_viridis_d()+
  theme_bw()

ggplot(data= bullying_survey%>%filter(variable %in% qns[9:17]), aes(x = year, y = prop, color = variable))+
  geom_point(position = "jitter")+
  geom_smooth(se=F)+
  scale_color_viridis_d()+
  theme_bw()

lab = data.frame(question = yrbss::yrbss_questions_binary$variable, label = yrbss::yrbss_questions_binary$label)%>%
   filter(question %in% qns)

kableExtra::kable(lab, byrow = F, caption = "Questions topics")
Questions topics
question label
qn13 Carried a weapon
qn14 Carried a gun
qn15 Carried a weapon on school property
qn16 Did not go to school because they felt unsafe at school or on their way to or from school
qn17 Were threatened or injured with a weapon on school property
qn18 Were in a physical fight
qn19 Were injured in a physical fight
qn20 Were in a physical fight on school property
qn24 Were bullied on school property
qn25 Were electronically bullied
qn26 Felt sad or hopeless
qn27 Seriously considered attempting suicide
qn28 Made a plan about how they would attempt suicide
qn29 Attempted suicide
qn30 Attempted suicide that resulted in an injury, poisoning, or overdose that had to be treated by a doctor or nurse
qnbullyweight Been teased b/c of weight past 12 mos
qnbullygay Been teased b/c labeled GLB past 12 mos
bullying_survey%>%
  slice(which(complete.cases(bullying_survey)))%>%
  select(label)%>%
  table()%>%
  data.frame()%>%
  kableExtra::kable(caption = "Observations per Question")
Observations per Question
. Freq
Attempted suicide 298
Attempted suicide that resulted in an injury, poisoning, or overdose that had to be treated by a doctor or nurse 282
Been teased b/c labeled GLB past 12 mos 18
Been teased b/c of weight past 12 mos 18
Carried a gun 231
Carried a weapon 275
Carried a weapon on school property 292
Did not go to school because they felt unsafe at school or on their way to or from school 298
Felt sad or hopeless 251
Made a plan about how they would attempt suicide 291
Seriously considered attempting suicide 308
Were bullied on school property 124
Were electronically bullied 94
Were in a physical fight 298
Were in a physical fight on school property 294
Were injured in a physical fight 270
Were threatened or injured with a weapon on school property 291

Model Fitting

fit = glm(incident_count~gpa+poverty+bullying_score+mh_score+ prev_year_art+year+offset(log(population)), family = "poisson", data = ss_counts)

if(FALSE){
ts_y = ts(ss_counts%>%slice(which(complete.cases(ss_counts)))%>%select(incident_count))
ts_x = ss_counts%>%
    ungroup()%>%
  slice(which(complete.cases(ss_counts)))%>%
  select(c(gpa,poverty,bullying_score,yearly_articles,population))%>%
  mutate(population = log(population))%>%
  as.matrix()

ts_fit = tsglm(ts_y, xreg = ts_x, model = list(past_obs = 1, external = TRUE), link = "log", distr = "poisson")

best_fit = forecast::auto.arima(diff(ts_y), xreg = diff(ts_x))

astsa::sarima(diff(ts_y),p = 0, d = 0, q =0, xreg = diff(ts_x))
arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x))$coef
sqrt(diag(vcov( arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x)))))

cilb= arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x))$coef-sqrt(diag(vcov( arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x)))))

ciub = arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x))$coef+sqrt(diag(vcov( arima(diff(ts_y),order = c(0,0,0), xreg = diff(ts_x)))))
data.frame(Lower= cilb, Upper = ciub)
}